
Advantages:
Disadvantages:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as matplot
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
warnings.filterwarnings("ignore")
# Import the Titanic dataset
X = pd.read_csv('titanic_train.csv')
X.shape
X.head()

# Separate the target column from the feature matrix
y = X.pop("Survived")
y.shape
y.head()

# Identifier-like columns carry no predictive signal — drop them
X.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
def clean_cabin(x):
    """Map a raw Cabin value to its deck letter.

    Returns the first character of the cabin string; missing cabins
    (NaN floats, which are not subscriptable) come back as the
    string "None".
    """
    try:
        deck = x[0]
    except TypeError:
        deck = "None"
    return deck
# Reduce Cabin to its deck letter ("C85" -> "C"; missing -> "None")
X["Cabin"] = X.Cabin.apply(clean_cabin)

# Impute missing ages with the column median. Reassign instead of
# fillna(..., inplace=True) on the column slice, which triggers
# chained-assignment warnings in modern pandas.
X["Age"] = X["Age"].fillna(X["Age"].median())

# Categorical features to one-hot encode
categorical_variables = ["Sex", "Cabin", "Embarked"]

# Fill missing categories with an explicit "Missing" level, then replace
# each categorical column with its dummy-indicator columns.
for variable in categorical_variables:
    X[variable] = X[variable].fillna("Missing")
    dummies = pd.get_dummies(X[variable], prefix=variable)
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

X.shape
X.head()
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# Numeric-only view of the training features
numeric_variables = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X_numeric = X_train[numeric_variables]
X_numeric.shape
X_numeric.head()
from sklearn.ensemble import RandomForestClassifier

# Baseline forest on numeric features only; oob_score=True gives a free
# out-of-bag accuracy estimate without a separate validation split.
model_1 = RandomForestClassifier(oob_score=True, random_state=42)
model_1 = model_1.fit(X_numeric, y_train)
print("The OOB Score is: " + str(model_1.oob_score_))

from sklearn.model_selection import cross_val_score
# Cross-validated accuracy for comparison with the OOB estimate
rf_result = cross_val_score(model_1, X_numeric, y_train, scoring='accuracy')
rf_result.mean()

from sklearn.metrics import roc_auc_score
# AUC is a ranking metric: score it on the OOB class-1 probabilities,
# not on argmax'd hard labels (hard labels collapse the ROC curve to a
# single point and understate the score).
rf_numeric_auc = roc_auc_score(y_train, model_1.oob_decision_function_[:, 1])
rf_numeric_auc
# Use the full train set (numeric + dummified categorical features)
X_cat = X_train.copy()
X_cat.shape
X_cat.head()

# Baseline forest on all features. NOTE(review): random_state=40 here vs
# 42 for the numeric baseline — presumably unintentional, but kept so the
# original results are reproducible.
model_2 = RandomForestClassifier(oob_score=True, random_state=40)
model_2 = model_2.fit(X_cat, y_train)
print("The OOB Score is: " + str(model_2.oob_score_))

rf_result = cross_val_score(model_2, X_cat, y_train, scoring='accuracy')
rf_result.mean()

# AUC on OOB class-1 probabilities rather than argmax'd hard labels:
# AUC is a ranking metric and hard labels understate it.
rf_cat_auc = roc_auc_score(y_train, model_2.oob_decision_function_[:, 1])
rf_cat_auc
Does scaling affect performance? Is it necessary for tree-based models?
from sklearn.preprocessing import StandardScaler

# Standardize the training features (zero mean, unit variance per column)
scaler = StandardScaler().fit(X_cat)
X_cat_scaled = scaler.transform(X_cat)
X_cat_scaled

# Same baseline forest, now fed the scaled inputs — compare its OOB score
# with model_2's to see whether scaling matters for a forest
model_3 = RandomForestClassifier(oob_score=True, random_state=40)
model_3 = model_3.fit(X_cat_scaled, y_train)
model_3.oob_score_
from sklearn.metrics import roc_curve

# ROC curves on the held-out test set for both baselines,
# using the class-1 predicted probabilities as scores
numeric_scores = model_1.predict_proba(X_test[X_numeric.columns])[:, 1]
cat_scores = model_2.predict_proba(X_test)[:, 1]
rf_numeric_fpr, rf_numeric_tpr, rf_numeric_thresholds = roc_curve(y_test, numeric_scores)
rf_cat_fpr, rf_cat_tpr, rf_cat_thresholds = roc_curve(y_test, cat_scores)

# Overlay both curves on one set of axes
plt.plot(rf_numeric_fpr, rf_numeric_tpr, label='RF Numeric (area = %0.2f)' % rf_numeric_auc)
plt.plot(rf_cat_fpr, rf_cat_tpr, label='RF Cat+Num (area = %0.2f)' % rf_cat_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Graph')
plt.legend(loc="lower right")
plt.show();
The greater the maximum depth (the deeper the tree), the higher the chance of overfitting.
# Sweep max_depth and compare train vs test AUC to visualize overfitting
results = []
results2 = []
max_depth_size = [1, 2, 3, 4, 5, 10, 20, 50, 100]

for depth in max_depth_size:
    # BUG FIX: the first positional argument of RandomForestClassifier is
    # n_estimators, not max_depth — the original passed `depth` positionally
    # and was sweeping the number of trees instead of the depth.
    model = RandomForestClassifier(max_depth=depth, oob_score=True,
                                   n_jobs=-1, random_state=44)
    model.fit(X_train, y_train)
    print(depth, 'depth')
    # NOTE(review): AUC from hard predict() labels understates the true
    # ranking AUC; predict_proba(...)[:, 1] would be stricter. Kept as-is
    # since the train/test gap still shows the overfitting trend.
    pred = model.predict(X_train)
    pred2 = model.predict(X_test)
    roc1 = roc_auc_score(y_train, pred)
    roc2 = roc_auc_score(y_test, pred2)
    print('AUC Train: ', roc1)
    print('AUC Test: ', roc2)
    results.append(roc1)
    results2.append(roc2)
    print(" ")

plt.plot(max_depth_size, results, label='Train Set')
plt.plot(max_depth_size, results2, label='Test Set')
plt.xlabel('Max Depth Size')
plt.ylabel('AUC Score')
plt.title('Train VS Test Scores')
plt.legend(loc="lower right")
plt.show();
Generally the more trees the better. You'll generalize better with more trees and reduce the variance more. The only downside is computation time.
# Sweep the number of trees; more trees generally reduces variance
results = []
n_estimator_options = [1, 2, 3, 4, 5, 15, 20, 25, 40, 50, 70, 100]

for trees in n_estimator_options:
    # First positional argument of RandomForestClassifier IS n_estimators,
    # so passing `trees` positionally is correct here.
    model = RandomForestClassifier(trees, oob_score=True, random_state=42)
    model.fit(X_train, y_train)
    print(trees, 'trees')
    # BUG FIX: oob_score_ is out-of-bag *accuracy*, not AUC — the original
    # labeled and named it "AUC", which is misleading.
    oob_acc = model.oob_score_
    print('OOB accuracy: ', oob_acc)
    results.append(oob_acc)
    print(" ")

pd.Series(results, n_estimator_options).plot();
# Sweep max_features (fraction or heuristic) at a fixed, large forest size
results = []
# NOTE(review): max_features="auto" was deprecated in scikit-learn 1.1 and
# removed in 1.3 (for classifiers it was equivalent to "sqrt") — confirm
# the installed sklearn version before running.
max_features_options = [0.7, 0.2, "auto", "sqrt", "log2"]

for max_features in max_features_options:
    model = RandomForestClassifier(n_estimators=1000, oob_score=True,
                                   n_jobs=-1, random_state=42,
                                   max_features=max_features)
    model.fit(X_train, y_train)
    print(max_features, "option")
    # BUG FIX: oob_score_ is out-of-bag accuracy, not AUC — label it honestly.
    oob_acc = model.oob_score_
    print('OOB accuracy: ', oob_acc)
    results.append(oob_acc)
    print(" ")

pd.Series(results, max_features_options).plot();
# Sweep min_samples_leaf; larger leaves regularize the trees
results = []
min_samples_leaf_options = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20]

for min_samples in min_samples_leaf_options:
    # NOTE(review): max_features="auto" was deprecated in sklearn 1.1 and
    # removed in 1.3 — verify against the installed version.
    model = RandomForestClassifier(n_estimators=1000, oob_score=True,
                                   n_jobs=-1, random_state=42,
                                   max_features="auto",
                                   min_samples_leaf=min_samples)
    model.fit(X_train, y_train)
    print(min_samples, "min samples")
    # BUG FIX: oob_score_ is out-of-bag accuracy, not AUC — label it honestly.
    oob_acc = model.oob_score_
    print('OOB accuracy: ', oob_acc)
    results.append(oob_acc)
    print(" ")

pd.Series(results, min_samples_leaf_options).plot();
# Inspect the individual fitted trees that make up the ensemble
model_2.estimators_

# Class balance of the raw target, as proportions rounded to 2 d.p.
Xx = pd.read_csv('titanic_train.csv')
round(Xx.Survived.value_counts(1), 2)
# Load libraries
import pydotplus
from sklearn import tree
from sklearn import datasets
from IPython.display import Image
from sklearn.tree import DecisionTreeClassifier

# Fit a deliberately tiny forest (one shallow tree) so it can be drawn
m = RandomForestClassifier(n_estimators=1, max_depth=3)
m = m.fit(X_train, y_train)

# Export that single tree to DOT format
dot_data = tree.export_graphviz(m.estimators_[0], out_file=None,
                                feature_names=X_train.columns,
                                class_names=True)

# Render the DOT text and display the image inline
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# Load libraries
import pydotplus
from io import StringIO  # BUG FIX: sklearn.externals.six was removed in scikit-learn >= 0.23
from IPython.display import Image
from sklearn.tree import export_graphviz

# Export the first tree of model_2 into an in-memory DOT buffer
dot_data = StringIO()
export_graphviz(model_2.estimators_[0], out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Show graph
Image(graph.create_png())
Feature selection: random forests perform implicit feature selection, whereas models like LASSO perform it explicitly.
A feature’s importance is the increase in the model’s prediction error after we permuted the feature’s values (breaks the relationship between the feature and the outcome).
What does it mean: the important features are more correlated with the dependent variable and contribute more variance to the dependent variable.
How does it work:
Note: Feature importance only gives you notion which features contributes to the decision, not "which way", because sometimes it will work one way, and sometimes the other way.
If you work with linear models then there is quite simple distinction if feature is "positive" or "negative", because the only impact it can have on the final result is being added (with weight).
However, ensemble of decision trees can have arbitrary complex rules for each feature, for example "if book has red cover and have more than 100 pages then if it contains dragons it gets high score" but "if book has blue cover and more than 100 pages then if it contains dragons it gets low score" and so on.
# Raw importance scores, one per training column
model_2.feature_importances_

# Pair each score with its column name, print, and plot smallest-first
feature_importances = pd.Series(model_2.feature_importances_, index=X.columns)
print(feature_importances)
feature_importances.sort_values(inplace=True)
feature_importances.plot(kind='barh', figsize=(7, 6))

# Largest single importance score
model_2.feature_importances_.max()
# Create function to combine feature importances
def graph_feature_importances(model, feature_names, autoscale=True, headroom=0.1,
                              width=10, summarized_columns=None):
    """Plot a fitted model's feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : iterable of column names aligned with the model's
        training matrix.
    autoscale, headroom : accepted for API compatibility; currently unused.
    width : figure width in inches.
    summarized_columns : optional list of name prefixes; every dummy column
        containing a prefix (e.g. "Cabin_A", "Cabin_B") is summed into one
        aggregate bar under that prefix.
    """
    feature_dict = dict(zip(feature_names, model.feature_importances_))
    if summarized_columns:
        for col_name in summarized_columns:
            # Sum the importances of every column derived from col_name...
            sum_value = sum(x for i, x in feature_dict.items() if col_name in i)
            # ...then drop those columns and insert a single aggregate entry.
            keys_to_remove = [i for i in feature_dict.keys() if col_name in i]
            for i in keys_to_remove:
                feature_dict.pop(i)
            feature_dict[col_name] = sum_value
    results = pd.Series(feature_dict, index=feature_dict.keys())
    results.sort_values(inplace=True)
    print(results)
    results.plot(kind='barh', figsize=(width, len(results) / 4), xlim=(0, .30))

# Create combined feature importances
graph_feature_importances(model_2, X.columns, summarized_columns=categorical_variables)
Random Forest :